Modeling the Joint Distribution of Wind Speed and Direction using Gaussain Mixture Models

OEN Method: Harris, Cook The parent wind speed distribution: Why Weibull? http://www.sciencedirect.com/science/article/pii/S0167610514001056

Gaussian Mixture Models, http://scikit-learn.org/stable/modules/mixture.html

1. Set up

1.1 Environment

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from import_file import *
from helpers.parallel_helper import *
load_libs()

plt.rcParams['axes.autolimit_mode'] = 'round_numbers'
plt.rcParams['axes.xmargin'] = 0.
plt.rcParams['axes.ymargin'] = 0.
mpl.rcParams['patch.force_edgecolor'] = True

1.2 Read Data

In [2]:
# Venezula
# file_path= './data/NCDC/arturo/dat.txt' # misdirection
# file_path= './data/NCDC/simon_bolivar_intl/dat.txt' # misdirection

# file_path, bandwidth= './data/NCDC/europe/uk/marham/dat.txt', 1.7
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/uk/tiree/dat.txt', 1.9, 4 
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NCDC/europe/uk/boscombe_down/dat.txt', 1.5, 4
# file_path, bandwidth= './data/NCDC/europe/uk/middle_wallop/dat.txt', 1.3
# file_path, bandwidth= './data/NCDC/europe/uk/bournemouth/dat.txt',1.3 # 4?
# file_path= "./data/NCDC/europe/uk/weybourne/dat.txt"
# file_path= "./data/NCDC/europe/uk/skye_lusa/dat.txt" # 
# file_path= "./data/NCDC/europe/uk/wattisham/dat.txt"
# file_path= "./data/NCDC/europe/uk/south_uist_range/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/holbeach/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/uk/cambridge/dat.txt" # inpropoer direction R square measure
# file_path= "./data/NCDC/europe/us/baltimore/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/bealach_na_ba/dat.txt" # time too short
# file_path= "./data/NCDC/europe/uk/benbecula/dat.txt" # truncate (untruncate in m/s), 4?
# file_path= './data/NCDC/europe/uk/southhamption/dat.txt' # high 0, trend

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = "./data/NCDC/europe/germany/landsberg_lech/dat.txt", 0.9, 4 
# file_path, bandwidth= "./data/NCDC/europe/germany/neuburg/dat.txt", 0.7
# file_path, bandwidth= "./data/NCDC/europe/germany/laupheim/dat.txt", 0.7 # double peak, 4?, trend
# file_path, bandwidth= './data/NCDC/europe/germany/niederstetten/dat.txt', 0.9 # get the peak
# file_path, bandwidth= "./data/NCDC/europe/germany/holzdorf/dat.txt", 0.9 # 2008 year
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/europe/france/nantes/dat.txt', 0.9, 4 # unit shift, one direction deviate big
# file_path= './data/NCDC/europe/france/pau_pyrenees/dat.txt' # unit shift, 2; force using knot 
# file_path= "./data/NCDC/europe/france/avord/dat.txt" # try 4, initial speed (should be good with m/s), incompete dataset
# file_path= "./data/NCDC/europe/france/vatry/dat.txt"  # double peak, initial speed, incompete dataset
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= "./data/NCDC/europe/spain/valladolid/dat.txt", 1.1, 4
# file_path= './data/NCDC/europe/spain/jerez/dat.txt' # high 0
# file_path, bandwidth= "./data/NCDC/europe/spain/barayas/dat.txt", 0.7 # not good fit
# file_path, bandwidth= './data/NCDC/europe/spain/malaga/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/tenerife_sur/dat.txt', 0.7 # directions blocked?
# file_path, bandwidth= './data/NCDC/europe/spain/almeria/dat.txt', 0.7 # negative dimensions?
# file_path, bandwidth= './data/NCDC/europe/greece/eleftherios_intl/dat.txt',0.7 # some direction might be blocked
# file_path= './data/NCDC/europe/ciampino/dat.txt' # try 4, bandwidth?
# file_path= "./data/NCDC/europe/huspel_aws/dat.txt"  # integer, 4?
# file_path= './data/NCDC/gibraltar/dat.txt' # bad fit

# MidEast
# file_path, bandwidth= './data/NCDC/mideast/uae/al_maktoum/dat.txt', 1.1
# file_path= './data/NCDC/mideast/uae/sharjah_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/dubai_intl/dat.txt' 
# file_path= './data/NCDC/mideast/uae/abu_dhabi_intl/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/uae/bateen/dat.txt' # Time shift
# file_path= './data/NCDC/mideast/buraimi/dat.txt' # not good dataset
# file_path= './data/NCDC/mideast/turkey/konya/dat.txt' 
# file_path= './data/NCDC/mideast/turkey/sivas/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/balikesir/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/turkey/bartin/dat.txt' # bad dataset
# file_path= './data/NCDC/mideast/iran/chahbahar/dat.txt'
# file_path= './data/NCDC/mideast/iran/zabol/dat.txt' # Problematic data
# file_path= './data/NCDC/mideast/iran/torbat_heydarieh/dat.txt' # Unusable

file_path, bandwidth = "./data/NCDC/cn/shanghai/hongqiao_intl/dat.txt", 0.6
# file_path, bandwidth= "./data/NCDC/cn/shanghai/pudong/dat.txt", 0.8
# file_path, bandwidth= "./data/NCDC/cn/hefei_luogang/dat.txt", 0.6 # few 0, trend, try 2
# file_path, bandwidth= "./data/NCDC/cn/nanjing_lukou/dat.txt", 0.5
# file_path= "./data/NCDC/cn/zhengzhou_xinzheng/dat.txt" 
# file_path= "./data/NCDC/cn/tianjin/binhai/dat.txt" # few 0, trend, stationary speed, unstationary direction
# file_path= "./data/NCDC/cn/tianjin/tianjing/dat.txt" # 16 sectors
# file_path= "./data/NCDC/cn/shijiazhuang_zhengding/dat.txt" 
# file_path= "./data/NCDC/cn/henan_gushi/dat.txt" # 16 sectors, fit not very good
# file_path= "./data/NCDC/cn/nanning_wuxu/dat.txt" # numpy priblem, unstationary speed
# file_path= './data/NCDC/cn/macau/dat.txt'  
# file_path= "./data/NCDC/cn/hk_intl/dat.txt" # few 0
# file_path= './data/NCDC/cn/gaoqi/dat.txt' 

# file_path= './data/NCDC/southeast_asia/malaysia/mersing/dat.txt' # 2 mode, paper comparison
# file_path= './data/NCDC/southeast_asia/malaysia/penang/dat.txt'
# file_path= './data/NCDC/southeast_asia/malaysia/butterworth/dat.txt' # 2 mode 
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_mahmud/dat.txt" # stable
# file_path= "./data/NCDC/southeast_asia/malaysia/bsultan_ismail/dat.txt" # 
# file_path= "./data/NCDC/southeast_asia/singapore/changi/dat.txt" # trend, no 0, questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/paya_lebar/dat.txt" # questionary data
# file_path= "./data/NCDC/southeast_asia/singapore/seletar/dat.txt"
# file_path= "./data/NCDC/east_asia/cheongju_intl/dat.txt" # 2005-2009  may have problem, fit is good; numpy problem
# file_path= "./data/NCDC/east_asia/daegu_ab/dat.txt" # recent 5 year may have problem, but fit is generally good; numpy problem

# file_path, bandwidth= "./data/NCDC/oceania/auckland_intl/dat.txt", 0.9  # Good data, double mode
# file_path= "./data/NCDC/oceania/brisbane_archerfield/dat.txt" # high 0, few data 
# file_path= "./data/NCDC/oceania/narrandera/dat.txt" # high 0, few data
# file_path, bandwidth= "./data/NCDC/oceania/canberra/dat.txt", 0.7 # high 0, bad fit
# file_path, bandwidth, NUMBER_OF_GAUSSIAN= './data/NCDC/oceania/horsham/dat.txt', 0.9, 4 # get the peak

# file_path, bandwidth= './data/NCDC/us/boston_16nm/dat.txt', 0.9 # Offshore, mixed type

# file_path, bandwidth= './data/asos/olympia/hr_avg.csv', 0.5 # might block
# file_path, bandwidth, NUMBER_OF_GAUSSIAN  = './data/asos/bismarck_ND/hr_avg.csv', 1.1, 4
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/aberdeen_SD/hr_avg.csv', 1.7, 2 # only to 2012
# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/asos/minneapolis/hr_avg.csv', 1.1, 4
# file_path, bandwidth = './data/asos/lincoln_NE/hr_avg.csv', 0.9
# file_path, bandwidth = './data/asos/des_moines_IA/hr_avg.csv', 1.3
# file_path, bandwidth = './data/asos/springfield_IL/hr_avg.csv', 1.1 
# file_path, bandwidth = './data/asos/topeka/hr_avg.csv', 0.7 # High 0
# file_path, bandwidth = './data/asos/denver/hr_avg.csv', 1.3

# file_path, bandwidth, NUMBER_OF_GAUSSIAN = './data/NDAWN/baker/hr_avg.csv', 0.7, 4 
# file_path, bandwidth = './data/NDAWN/dickinson/hr_avg.csv', 0.6
# file_path = './data/NDAWN/rugby/hr_avg.csv'
# file_path = './data/NDAWN/bowman/hr_avg.csv'
# file_path = './data/NDAWN/grand_forks/hr_avg.csv'
# file_path = './data/NDAWN/williston/hr_avg.csv'
# file_path = './data/NDAWN/jamestown/hr_avg.csv'

# file_path, NUMBER_OF_GAUSSIAN = 'data/ECMWF/usa/47N123W/dat.csv', 4 # good enough
# file_path = 'data/ECMWF/venezuela/8N67W/dat.csv' # good enough, can be coorect, still need other investigation.
# file_path = 'data/ECMWF/chile/52S75W/dat.csv' # good enough
# file_path = 'data/ECMWF/iceland/65N17W/dat.csv' # good enough
# file_path, NUMBER_OF_GAUSSIAN = 'data/ECMWF/germany/49N9E/dat.csv', 4 # miss peak
# file_path = 'data/ECMWF/sudan/18N32E/dat.csv' # good enough
# file_path = 'data/ECMWF/china/24N121E/dat.csv' # good enough
# file_path, NUMBER_OF_GAUSSIAN = 'data/ECMWF/australia/37S142E/dat.csv', 4 # miss the peak
In [3]:
if "cn_database" in file_path: 
    df = read_cn_database(file_path)
elif 'NCDC' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df.rename(columns={'Date':'date','Dir':'dir','Spd':'speed','Type':'type','I.1':'wind_type'}, inplace=True)
    df = df[['date','HrMn','type','dir','speed','wind_type' ]]
    df.dropna(subset=['dir','speed'], inplace=True)
    integer_data = True
elif 'NDAWN' in file_path:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
elif 'asos' in file_path:
    # ASOS
    df = pd.read_csv(file_path, header=0, skipinitialspace=True, dtype={'HrMn':'object'})
    df['type']='default'
    df['wind_type']='default'
    df = df.dropna()
    convert_to_knot = False
    integer_data = False
    knot_unit = True
else:
    df = pd.read_csv(file_path, header=0, skipinitialspace=True)
    df.rename(columns={'U':'x','V':'y'}, inplace=True)
    df.x=-df.x
    df.y=-df.y
    df['speed']=np.sqrt(df.x**2+df.y**2)
    df['dir']=np.degrees(np.arctan2(df.y, df.x))%360
    df['time']=pd.to_datetime('1979-01-01T00:00:00Z')+pd.to_timedelta(df['time'], unit='h')
    df['date']=df['time'].dt.strftime('%Y%m%d')
    df['date']=df['date'].astype(int)
    df['HrMn']=df['time'].dt.strftime('%H00')
    df['type']='default'
    df['wind_type']='default'
    convert_to_knot = True
    integer_data = False
    cartesian = True
In [4]:
df
Out[4]:
date HrMn type dir speed wind_type
0 19560820 0000 FM-12 200 5.1 N
1 19560820 0300 FM-12 250 4.1 N
2 19560820 0600 FM-12 250 5.1 N
3 19560820 0900 FM-12 270 6.2 N
4 19560820 1200 FM-12 270 5.1 N
5 19560820 1800 FM-12 290 3.1 N
6 19560820 2100 FM-12 320 3.1 N
7 19560821 0000 FM-12 320 3.1 N
8 19560821 0300 FM-12 290 4.1 N
9 19560821 0600 FM-12 270 5.1 N
10 19560821 0900 FM-12 320 2.1 N
11 19560821 1200 FM-12 90 1.0 N
12 19560821 1800 FM-12 140 2.1 N
13 19560821 2100 FM-12 140 2.1 N
14 19560822 0300 FM-12 140 6.2 N
15 19560822 0600 FM-12 160 4.1 N
16 19560822 1200 FM-12 140 4.1 N
17 19560822 1800 FM-12 200 3.1 N
18 19560822 2100 FM-12 290 7.2 N
19 19560823 0300 FM-12 270 7.2 N
20 19560823 0900 FM-12 320 5.1 N
21 19560823 1200 FM-12 270 1.0 N
22 19560823 2100 FM-12 999 0.0 C
23 19560824 0000 FM-12 999 0.0 C
24 19560824 0300 FM-12 270 1.0 N
25 19560824 0600 FM-12 90 1.0 N
26 19560824 0900 FM-12 160 2.1 N
27 19560824 1200 FM-12 999 0.0 C
28 19560824 1800 FM-12 999 0.0 C
29 19560824 2100 FM-12 140 4.1 N
... ... ... ... ... ... ...
359333 20150301 0900 FM-15 270 4.0 N
359334 20150301 0930 FM-15 260 4.0 V
359335 20150301 1000 FM-15 250 4.0 V
359336 20150301 1030 FM-15 250 3.0 V
359337 20150301 1100 FM-15 260 4.0 N
359338 20150301 1130 FM-15 230 3.0 N
359339 20150301 1200 FM-15 230 2.0 V
359340 20150301 1230 FM-15 270 3.0 N
359341 20150301 1300 FM-15 240 2.0 V
359342 20150301 1330 FM-15 260 2.0 V
359343 20150301 1400 FM-15 250 2.0 V
359344 20150301 1430 FM-15 240 2.0 V
359345 20150301 1500 FM-15 999 0.0 C
359346 20150301 1530 FM-15 999 1.0 V
359347 20150301 1600 FM-15 999 1.0 V
359348 20150301 1630 FM-15 210 1.0 N
359349 20150301 1700 FM-15 999 1.0 V
359350 20150301 1730 FM-15 999 1.0 V
359351 20150301 1800 FM-15 999 1.0 V
359352 20150301 1830 FM-15 180 1.0 N
359353 20150301 1900 FM-15 160 1.0 N
359354 20150301 1930 FM-15 210 1.0 N
359355 20150301 2000 FM-15 230 2.0 N
359356 20150301 2030 FM-15 230 1.0 N
359357 20150301 2100 FM-15 240 2.0 N
359358 20150301 2130 FM-15 180 1.0 N
359359 20150301 2200 FM-15 160 1.0 N
359360 20150301 2230 FM-15 150 1.0 N
359361 20150301 2300 FM-15 190 2.0 N
359362 20150301 2330 FM-15 190 1.0 N

359363 rows × 6 columns

In [5]:
if 'NCDC' in file_path:
    lat, long = get_lat_long(file_path)
    print(lat,long)
    map_osm = folium.Map(location=[lat, long], zoom_start=4)
    folium.Marker([lat, long]).add_to(map_osm)
    display(map_osm)
31.198 121.336
In [6]:
df['time']=pd.to_datetime(df["date"].astype(str).map(str) + df["HrMn"], format='%Y%m%d%H%M')
df.set_index(['time'], inplace=True)
df['HrMn']=df['HrMn'].astype(int)
df = df.query("(dir <= 999) & (speed < 100) ")['1970':'2016']
In [7]:
plot_speed_and_angle_distribution(df.speed, df.dir)
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
In [8]:
# Dir [10,360]=> [0,350]
df['dir'] = df['dir'].apply(lambda x: x%360 if x < 999 else x) 
# Convert Windrose coordianates to Polar Cooridinates 
if 'cartesian' in globals():
    df['dir_windrose'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
else:
    df['dir_windrose'] = df['dir']
    df['dir'] = df['dir'].apply(lambda x: (90 - x)%360 if x < 999 else x)
display(df.describe())
df.plot(y='speed',legend=True,figsize=(20,5))
date HrMn dir speed dir_windrose
count 3.369710e+05 336971.000000 336971.000000 336971.000000 336971.000000
mean 2.000006e+07 1128.070137 268.228497 3.543890 248.807883
std 1.115119e+05 691.735348 275.924866 2.002307 279.529540
min 1.973010e+07 0.000000 0.000000 0.000000 0.000000
25% 1.991120e+07 530.000000 90.000000 2.000000 90.000000
50% 2.002062e+07 1100.000000 200.000000 3.000000 150.000000
75% 2.010050e+07 1730.000000 320.000000 5.000000 310.000000
max 2.015030e+07 2357.000000 999.000000 30.000000 999.000000
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0xcd97438>

1.3 General Data Info

1.3.1 Unit Detection

In [9]:
df['decimal'] = df.speed % 1
df.decimal.hist(alpha=0.5, label='m/s', figsize=(4, 3))

if 'convert_to_knot' not in globals():
    convert_to_knot = True if len(df.query('decimal >= 0.2')) / len(df) > 0.3 else False
    
if convert_to_knot:
    knot_unit = True
    df['speed'] = df['speed'] * 1.943845
    df['decimal'] = df.speed % 1
    df.decimal.hist(alpha=0.5, label='knot')
    # need more elaboration, some is not near an integer
    if integer_data:
        df['speed'] = df['speed'].apply(lambda x: int(round(x)))
    plt_configure(xlabel='Decimal', ylabel='Frequency', legend={'loc': 'best'}, title='Decimal Distribution')
else:
    knot_unit = False
    
df.drop(['decimal'], 1,inplace=True)
print(knot_unit)
False
In [10]:
dir_unit_text = ' (degree)'
if knot_unit == True:
    speed_unit_text = ' (knot)'
else: 
    speed_unit_text = ' (m/s)'

1.3.2 Sampling Type Selection

In [11]:
sample_type = df.query('date > 20000000')['type']
sample_type.value_counts().plot(
    kind = 'bar', title = 'Report Types Comprisement', figsize=(4,3))

report_type_most_used = sample_type.value_counts().argmax()
df = df.query("type==@report_type_most_used")

1.3.3 Sampling Time Selection

In [12]:
MID_YEAR = int(np.average(df.index.year))

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5,label='Overall')
df[str(MID_YEAR):]['HrMn'].value_counts().sort_index().plot(
    kind='bar', alpha=0.5, label='>= %s' %  MID_YEAR )

plt_configure(xlabel='Sampling Time', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 4), 
              title = 'Sampling Time Distribution, Overall and > %s ' %  MID_YEAR)
In [13]:
df['sample_time'] = df.HrMn % 100 
sample_time = df['2000':]['sample_time']
sample_times = sample_time.value_counts()[sample_time.value_counts() > 2000]
sample_times = sample_times.index.tolist()
# df = df.query("sample_time in @sample_times")
df = df.query("sample_time == @sample_times[0]")
df.drop(['sample_time'], 1,inplace=True)
print(sample_times)

df['HrMn'].value_counts().sort_index().plot(kind='bar', alpha=0.5, figsize=(10, 4))
[0, 30]
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x14af2ba8>

1.4 Error Data handling and Adjustment

1.4.1 Artefacts

wrong direction record

In [14]:
if integer_data:
    display(df.query("(dir % 10 >= 0.1) & (dir != 999)"))
    df = df.query('(dir % 10 <= 0.1) | (dir == 999)')
date HrMn type dir speed wind_type dir_windrose
time
1994-01-28 00:00:00 19940128 0 FM-15 119 3.0 N 331
1994-07-18 10:00:00 19940718 1000 FM-15 337 5.0 N 113
1994-08-05 11:00:00 19940805 1100 FM-15 335 9.0 N 115
1994-08-10 05:00:00 19940810 500 FM-15 319 10.0 N 131
1994-09-03 21:00:00 19940903 2100 FM-15 331 5.0 N 119
1994-12-03 14:00:00 19941203 1400 FM-15 316 3.0 N 134
1995-04-03 13:00:00 19950403 1300 FM-15 337 3.0 N 113
1998-06-03 11:00:00 19980603 1100 FM-15 59 10.0 N 31
1998-09-09 12:00:00 19980909 1200 FM-15 359 20.0 N 91

sudden increase in speed

In [15]:
# sudden increse
df['incre'] = df.speed.diff(1)
df['incre'].fillna(0, inplace=True)
df['incre_reverse'] = df.speed.diff(-1)
df['incre_reverse'].fillna(0, inplace=True)

display(df.sort_values(by='speed',ascending=False).head(10))
df['incre'].plot(kind='hist', bins=arange(-15, 15), legend=True, figsize=(8, 3))
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
1993-12-21 22:00:00 19931221 2200 FM-15 110 22.0 N 340 19.0 20.0
1993-08-02 14:00:00 19930802 1400 FM-15 140 21.0 N 310 17.0 21.0
1985-04-30 08:00:00 19850430 800 FM-15 310 16.0 N 140 9.8 9.8
2005-09-11 17:00:00 20050911 1700 FM-15 0 16.0 N 90 4.0 4.0
1981-12-19 07:00:00 19811219 700 FM-15 140 15.9 N 310 13.9 3.1
1994-10-20 08:00:00 19941020 800 FM-15 140 15.0 N 310 6.0 8.0
2012-08-08 05:00:00 20120808 500 FM-15 10 15.0 N 80 0.0 2.0
1995-11-07 04:00:00 19951107 400 FM-15 110 15.0 N 340 5.0 2.0
2012-08-08 04:00:00 20120808 400 FM-15 0 15.0 N 90 2.0 0.0
2008-07-02 08:00:00 20080702 800 FM-15 170 15.0 N 280 10.0 13.0
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xce409e8>
In [16]:
incre_threshold = 20 if knot_unit else 10
print('sudden increase number', len(df.query('(incre > @incre_threshold )&(incre_reverse > @incre_threshold )')))
df = df.query('(incre < @incre_threshold )|(incre_reverse < @incre_threshold )')

# Check the max speed
display(df.sort_values(by='speed',ascending=False).head(10))
df.drop(['incre', 'incre_reverse'], 1, inplace=True)
sudden increase number 2
date HrMn type dir speed wind_type dir_windrose incre incre_reverse
time
2005-09-11 17:00:00 20050911 1700 FM-15 0 16.0 N 90 4.0 4.0
1985-04-30 08:00:00 19850430 800 FM-15 310 16.0 N 140 9.8 9.8
1981-12-19 07:00:00 19811219 700 FM-15 140 15.9 N 310 13.9 3.1
2012-08-08 04:00:00 20120808 400 FM-15 0 15.0 N 90 2.0 0.0
2012-08-08 05:00:00 20120808 500 FM-15 10 15.0 N 80 0.0 2.0
1995-11-07 04:00:00 19951107 400 FM-15 110 15.0 N 340 5.0 2.0
1995-03-09 15:00:00 19950309 1500 FM-15 110 15.0 N 340 4.0 3.0
1994-10-20 08:00:00 19941020 800 FM-15 140 15.0 N 310 6.0 8.0
2013-03-09 17:00:00 20130309 1700 FM-15 80 14.0 N 10 8.0 2.0
2005-08-06 03:00:00 20050806 300 FM-15 350 14.0 N 100 4.0 1.0

1.4.2 Direction re-aligment

For some dataset, the 16 sectors are not record properly,

e.g. the sectors are [0,20,50 ...], need to redistribute the angle into 22.5, e.g. [0, 22.5, 45...]

In [17]:
display(df['dir'].value_counts().sort_index())
effective_column = df.query('dir < 999')['dir'].value_counts()[df['dir'].value_counts() > 30].sort_index()
if integer_data:
    SECTOR_LENGTH = 360/len(effective_column) 
else: 
    SECTOR_LENGTH = 10
print(len(effective_column), SECTOR_LENGTH)
0       5114
10      3736
20      3931
30      4370
40      4484
50      6036
60      6681
70      5524
80      5811
90      8968
100     8340
110     7085
120     7931
130     6770
140     4428
150     3796
160     3319
170     2573
180     2191
190     1555
200     1578
210     1991
220     1885
230     1882
240     2132
250     2360
260     3096
270     4399
280     4881
290     7184
300    11033
310     9313
320     9832
330    10032
340     7915
350     6258
999    25824
Name: dir, dtype: int64
36 10.0
In [18]:
df=realign_direction(df, effective_column)

1.4.3 0 Speed

In [19]:
with_too_many_zero, null_wind_frequency = is_with_too_many_zero(df['2005':])
delete_zero = with_too_many_zero
if delete_zero:
    df = df.query('(speed > 0)')
print(delete_zero, null_wind_frequency)
False 0.0412621359223
In [20]:
print(df.query('dir == 999')['speed'].value_counts())
df=fill_direction_999(df, SECTOR_LENGTH)
0.0    15499
2.0     5486
1.0     4531
3.0      229
4.0       43
5.0       15
3.1        8
4.1        4
2.1        3
7.0        2
5.1        2
6.0        2
Name: speed, dtype: int64

1.5 Time Shift Comparison

In [21]:
DIR_REDISTRIBUTE = 'even'
if DIR_REDISTRIBUTE == 'even':
    DIR_BIN = arange(-5, 360, 10) 
elif DIR_REDISTRIBUTE == 'round_up':
    DIR_BIN = arange(0, 360+10, 10) 

# Comparison between mid_year, looking for: 
# 1. Odd Even Bias
# 2. Time Shift of Wind Speed Distribution
bins = arange(0, df.speed.max() + 1)
df[:str(MID_YEAR)]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['speed'].plot(
    kind='hist', alpha=0.5,bins=bins, label='> %s' % MID_YEAR)

plt.suptitle('Speed Comparison between year < %s, > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Speed', ylabel='Frequency', legend=True, figsize=(8, 3))
In [22]:
df[:str(MID_YEAR)]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='< %s' % MID_YEAR)

df[str(MID_YEAR+1):]['dir'].plot(
    kind='hist', alpha=0.5,bins=DIR_BIN, label='> %s' % MID_YEAR)

plt.suptitle('Dir Comparison between year < %s, and > %s ' % (MID_YEAR, MID_YEAR), fontsize = 14)
plt_configure(xlabel='Dir', ylabel='Frequency', legend={'loc':'best'}, figsize=(8, 3), tight='x')
In [23]:
display(df[df['dir'].isnull()])
df.dropna(subset=['dir'], inplace=True)
date HrMn type dir speed wind_type dir_windrose
time
In [24]:
# Inspect the time shift of speed and degree distribution, and odd-even bias
check_time_shift(df, speed_unit_text=speed_unit_text, dir_unit_text=dir_unit_text)
1979 - 1980
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
1981 - 1985
1986 - 1990
1991 - 1995
1996 - 2000
2001 - 2005
2006 - 2010
2011 - 2015
In [25]:
df.resample('A').mean().plot(y='speed')
plt.gca().set_ylim(bottom=0)
df.resample('M').mean().plot(y='speed', figsize=(20,4))
plt.gca().set_ylim(bottom=0)
Out[25]:
(0, 6.0)
In [26]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    den, _ = np.histogram(df[column], bins=bins, density=True)
    y_top=max(den)*1.2
    for year in arange(1980, 2016):
        end_year = year
        sub_df = df[str(year):str(end_year)]
        if len(sub_df) > 1000:
            plt.figure()
            df[column].hist(bins=bins, alpha=0.3, normed=True)
            sub_df[column].hist(bins=bins, alpha=0.5, figsize=(3,1.5), normed=True)
            plt.gca().set_ylim(top=y_top)
            plt_configure(title=str(year))
    align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
In [27]:
for column in ['speed', 'dir']:
    if column == 'speed':
        bins = arange(0, df[column].max()+1, 1)
    else:
        bins = arange(0, 361, 10)
    density_all, _ = np.histogram(df[column], bins=bins, density=True)
    df[column].hist(bins=bins, figsize=(5,3))

    R_squares = []
    years = []
    for year in arange(1980, 2016):
        start_year, end_year = year-1, year+1
        sub_df = df[str(start_year):str(end_year)]
        if len(sub_df) > 1000:
            density, _ = np.histogram(sub_df[column], bins=bins, density=True)
            y_mean = np.mean(density_all)
            SS_tot = np.sum(np.power(density_all - y_mean, 2))
            SS_res = np.sum(np.power(density_all - density, 2))

            R_square = 1 - SS_res / SS_tot
            R_squares.append(R_square)
            years.append(year)

    plt.figure()
    plot(years, R_squares)
    ylim = max(min(plt.gca().get_ylim()[0],0.85),0)
    plt.gca().set_ylim(bottom=ylim, top=1)
    plt_configure(figsize=(5,3))
    align_figures()

1.6 Re-distribute Direction and Speed (Optional)

e.g. Dir 50 -> -45 ~ 55, to make KDE result better

In [28]:
if integer_data:
    df = randomize_angle(df, DIR_REDISTRIBUTE, SECTOR_LENGTH)
In [29]:
if integer_data:
    if delete_zero:
        redistribute_method = 'down'
    else:
        redistribute_method = 'up'

    df, speed_redistribution_info = randomize_speed(df, redistribute_method)
Redistribute upward, e.g. 0 -> [0,1]

1.7 Generate (x,y) from (speed,dir)

In [30]:
# Cook orientation
# df['dir']= (df['dir'] + 180)%360
In [31]:
# There might be a small dot in the centre, which is due to too many zero (more than 1 speed) in center
# Scatter plot in matplot has performance issue, the speed is very slow
df['x'] = df['speed'] * cos(df['dir'] * pi / 180.0)
df['y'] = df['speed'] * sin(df['dir'] * pi / 180.0)

2. Re-select Data and Overview

2.1 Data Overview

In [32]:
## Summery of the data selection
print('Knot unit?', knot_unit)
print('Report type used:', report_type_most_used)
print('Sampling time used:', sample_times)
if 'speed_redistribution_info' in globals():
    print('Speed redistribution info:', speed_redistribution_info )

df_all_years = df # for later across-year comparison
df = df_all_years['2011':'2015']
# df = df.query('(HrMn == 0) and (speed >= 0.5) and (date%10000 > 900) and (date%10000 < 1000)' )
df.describe()
Knot unit? False
Report type used: FM-15
Sampling time used: [0, 30]
Speed redistribution info: Redistribute upward, e.g. 0 -> [0,1]
Out[32]:
date HrMn dir speed dir_windrose x y
count 3.635200e+04 36352.000000 36352.000000 36352.000000 36352.000000 36352.000000 36352.000000
mean 2.012661e+07 1149.460827 181.216628 4.545738 206.211653 1.004473 0.486174
std 1.194965e+04 692.188887 113.942791 1.959806 230.381866 3.037031 3.746494
min 2.011010e+07 0.000000 -4.998856 0.001584 0.000000 -12.233276 -11.686966
25% 2.012012e+07 500.000000 82.649318 3.175702 70.000000 -0.993980 -2.369535
50% 2.013013e+07 1100.000000 158.731315 4.397547 140.000000 1.368626 0.280117
75% 2.014021e+07 1700.000000 300.804551 5.764333 280.000000 3.293004 3.443985
max 2.015030e+07 2300.000000 354.985605 15.332590 999.000000 15.164576 13.787213
In [33]:
df.plot(y='speed',legend=True,figsize=(20,5))
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1bb0e240>
In [34]:
# Accumulation by month
df.resample('M').count().plot(y='date', kind='bar',figsize=(20,4))
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c2cfb38>
In [35]:
# 90 degree is east
ax = WindroseAxes.from_ax()
viridis = plt.get_cmap('viridis')
ax.bar(df.dir_windrose, df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=viridis)
ax.set_legend()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [36]:
if len(df) > 1000000:
    bins=arange(0,362)
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min')
    
    df = df_all_years.sample(n=500000, replace=True)    
    df['dir'].hist(bins=bins, normed=True,alpha=0.5,label='min resmapled')
    plt_configure(legend=True, figsize=(20,4))
In [37]:
x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed)

# 1. Histogram comparison
fig = plt.figure()
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data', normed=True)             
plot(x, y_weibull, '-', color='black',label='Weibull')   
plt_configure(figsize=(4,3),xlabel='V',ylabel='PDF', legend=True)

# 2. CDF comparison
fig = plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label='ECDF')
plot(log(x), log(-log(1-y_cdf_weibull)),'-', label='Weibull')
plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'}, figsize=(4,3))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
In [38]:
df.plot(kind='scatter', x='x', y='y', alpha=0.05, s=2)
plt.gca().set_aspect('equal')
plt_configure(figsize=(3.2,3.2),xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)

2.2 Overview by Direction

In [39]:
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 10
In [40]:
%%time
original_incre, incre = SECTOR_LENGTH, rebinned_angle
start, end = -original_incre/2 + incre/2, 360

max_speed = df.speed.max()
max_count = max_count_for_angles(df, start, end, incre)
plot_range = [0, max_speed, 0, max_count*1.05]

for angle in arange(start, end, incre):
    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)   
    
    fig = plt.figure()
    sub_df['speed'].hist(bins=arange(0, max_speed), alpha=0.5, label='Data')
    title ='%s (%s - %s), %s' % (angle, start_angle, end_angle, len(sub_df)) 
    plt.axis(plot_range)
    plt_configure(figsize=(3,1.5), title=title)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
Wall time: 5.82 s

2.3 Overview by Month

In [41]:
%%time
month_incre = 1
current_df = df.query('speed>=1')
for month in arange(1, 13): 
    sub_df = current_df[current_df.index.month == month]
    ax = WindroseAxes.from_ax()
    ax.bar(sub_df.dir_windrose, sub_df.speed, normed=True, opening=0.8, edgecolor='white', nsector=36, cmap=plt.get_cmap('viridis'))
    plt_configure(figsize=(3,3), title='Month: %s'%(month))
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Wall time: 17.9 s
In [42]:
df.describe()
Out[42]:
date HrMn dir speed dir_windrose x y
count 3.635200e+04 36352.000000 36352.000000 36352.000000 36352.000000 36352.000000 36352.000000
mean 2.012661e+07 1149.460827 181.216628 4.545738 206.211653 1.004473 0.486174
std 1.194965e+04 692.188887 113.942791 1.959806 230.381866 3.037031 3.746494
min 2.011010e+07 0.000000 -4.998856 0.001584 0.000000 -12.233276 -11.686966
25% 2.012012e+07 500.000000 82.649318 3.175702 70.000000 -0.993980 -2.369535
50% 2.013013e+07 1100.000000 158.731315 4.397547 140.000000 1.368626 0.280117
75% 2.014021e+07 1700.000000 300.804551 5.764333 280.000000 3.293004 3.443985
max 2.015030e+07 2300.000000 354.985605 15.332590 999.000000 15.164576 13.787213

3. Create input data and configuration

In [43]:
SPEED_SET = array(list(zip(df.x, df.y)))
if 'NUMBER_OF_GAUSSIAN' not in globals():
    NUMBER_OF_GAUSSIAN = 3
FIT_METHOD = 'square_error'
DEFAULT_BANDWDITH = 1.5 if knot_unit else 0.7
fig_list = []
In [44]:
fit_limit = ceil(df['speed'].quantile(.95))
fitting_axis_range = arange(-fit_limit, fit_limit+1, 1)
print(fitting_axis_range)

FITTING_RANGE = []
for i in fitting_axis_range:
    for j in fitting_axis_range:
        FITTING_RANGE.append([i,j])
[-8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8]
In [45]:
plot_limit = ceil(df['speed'].quantile(.95))
PLOT_AXIS_RANGE = arange(-plot_limit, plot_limit+1, 1)

4. Kernel Density Estimation

In [46]:
sample = SPEED_SET
KDE_KERNEL = 'gaussian'
# KDE_KERNEL, bandwidth = 'tophat', 1
In [47]:
%%time
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH
    from sklearn.grid_search import GridSearchCV
    # from sklearn.model_selection import GridSearchCV  ## too slow

    # The bandwidth value sometimes would be too radical
    if knot_unit:
        bandwidth_range = arange(0.7,2,0.2)
    else:
        bandwidth_range = arange(0.4,1,0.1)

    # Grid search is unable to deal with too many data (a long time is needed)
    if len(sample) > 50000:    
        df_resample=df.sample(n=50000, replace=True)
        bandwidth_search_sample = array(list(zip(df_resample.x, df_resample.y)))
    else:
        bandwidth_search_sample = sample

    grid = GridSearchCV(neighbors.KernelDensity(kernel = KDE_KERNEL),
                    {'bandwidth': bandwidth_range}, n_jobs=-1, cv=4) 

    grid.fit(bandwidth_search_sample)
    bandwidth = grid.best_params_['bandwidth']
    
print(bandwidth)
0.6
Wall time: 0 ns
In [48]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH

kde = neighbors.KernelDensity(bandwidth=bandwidth, kernel = KDE_KERNEL).fit(sample)

points = FITTING_RANGE
# very slow if the dataset is too large, e.g. 100,000
# kde returns log prob, need to convert it
kde_result = exp(kde.score_samples(points))
print('bandwidth:', bandwidth, len(kde_result))
print(kde_result[:5])
bandwidth: 0.6 289
[  2.65944997e-07   4.59101114e-07   3.74956674e-06   1.96597950e-05
   3.46558171e-05]
In [49]:
# Plot jPDF
X = Y = PLOT_AXIS_RANGE
# Can't work if pass as generate_Z_from_X_Y(X,Y, exp(kde.score_samples())), need to use lambda
# see http://stackoverflow.com/questions/21035437/passing-a-function-as-an-argument-in-python
kde_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(kde.score_samples(coords)))
colorbar_lim = 0, kde_Z.max()

plot_3d_prob_density(X,Y,kde_Z)

fig_kde,ax1 = plt.subplots(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, ax=ax1)

with sns.axes_style({'axes.grid' : False}):
    from matplotlib import ticker
    fig_hist,ax2 = plt.subplots(figsize=(3.5,2.5))
    _,_,_,image = ax2.hist2d(df.x, df.y, bins=PLOT_AXIS_RANGE, cmap='viridis',)
    ax2.set_aspect('equal')
    cb = plt.colorbar(image)
    tick_locator = ticker.MaxNLocator(nbins=6)
    cb.locator = tick_locator
    cb.update_ticks()
    plt_configure(ax=ax2, xlabel='x'+speed_unit_text,ylabel='y'+speed_unit_text)
align_figures()
In [50]:
kde_cdf = cdf_from_pdf(kde_result)
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}
In [51]:
%%time
gof_kde=Parallel(n_jobs=-1)(delayed(resampled_kde)(df, kde_result, config) 
                                       for i in arange(50)) 
Wall time: 12.1 s
In [52]:
for gof_name in [ 'R_square', 'K_S','Chi_square']:
    plt.figure(figsize=(4,3))
    pd.DataFrame(gof_kde)[gof_name].hist()
    plt_configure(title=gof_name)
align_figures()
In [53]:
# %%time
year_length = 5
gofs_bivariate = []
df_start_year, df_end_year = df_all_years.index.year[0], df_all_years.index.year[-1]
for start_year in arange(df_start_year, df_end_year-year_length):
    end_year = start_year+year_length-1
    df_previous = df_all_years[str(start_year):str(end_year)]
    speed_previous = array(list(zip(df_previous.x, df_previous.y)))
    kde2 = neighbors.KernelDensity(bandwidth=bandwidth, kernel=KDE_KERNEL).fit(speed_previous)
    kde_result2 = exp(kde2.score_samples(points))
    gofs_bivariate.append(goodness_of_fit_summary(kde_result2, kde_result))
gofs_bivariate=pd.DataFrame(gofs_bivariate)
gofs_bivariate.index = arange(df_start_year, df_end_year-year_length)
In [54]:
gofs_bivariate
Out[54]:
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1979 802.085186 0.072152 3.761143e-06 0.118174 0.570986 0.722794
1980 0.539279 0.081185 3.824481e-06 0.119165 0.575773 0.718126
1981 112.728025 0.076514 4.098733e-06 0.123363 0.596060 0.697913
1982 167.553974 0.080018 4.138213e-06 0.123956 0.598924 0.695003
1983 263.223514 0.087966 4.172916e-06 0.124475 0.601430 0.692446
1984 161.172342 0.135298 1.584599e-05 0.242561 1.171993 -0.167889
1985 3.783908 0.121449 1.420186e-05 0.229633 1.109527 -0.046713
1986 0.166531 0.089923 9.005964e-06 0.182863 0.883548 0.336238
1987 0.153836 0.097636 7.957578e-06 0.171891 0.830531 0.413507
1988 0.152328 0.110815 7.038154e-06 0.161656 0.781078 0.481270
1989 0.126984 0.093474 4.714281e-06 0.132303 0.639253 0.652546
1990 0.109780 0.084216 3.551416e-06 0.114832 0.554838 0.738252
1991 0.139116 0.103113 4.387427e-06 0.127634 0.616695 0.676636
1992 0.114950 0.094546 3.956182e-06 0.121199 0.585603 0.708420
1993 0.116579 0.083940 3.752497e-06 0.118038 0.570329 0.723432
1994 0.132093 0.088618 3.993954e-06 0.121776 0.588392 0.705636
1995 0.155100 0.099329 5.152162e-06 0.138311 0.668282 0.620273
1996 0.138222 0.092232 4.969471e-06 0.135837 0.656327 0.633738
1997 0.119654 0.087624 3.846433e-06 0.119506 0.577423 0.716508
1998 0.111374 0.094331 3.604100e-06 0.115680 0.558938 0.734369
1999 0.112910 0.096082 3.419320e-06 0.112676 0.544421 0.747988
2000 0.124339 0.093494 2.911175e-06 0.103967 0.502342 0.785439
2001 0.137330 0.094211 2.941306e-06 0.104504 0.504935 0.783218
2002 8.024373 0.106958 3.625153e-06 0.116018 0.560568 0.732817
2003 0.158958 0.104181 3.504101e-06 0.114064 0.551129 0.741739
2004 0.139002 0.099466 3.409883e-06 0.112520 0.543670 0.748683
2005 0.124149 0.094434 3.064870e-06 0.106676 0.515432 0.774111
2006 0.100283 0.080361 2.475625e-06 0.095875 0.463242 0.817540
2007 0.067844 0.056988 1.529106e-06 0.075350 0.364069 0.887301
2008 0.084628 0.036044 7.970651e-07 0.054401 0.262853 0.941254
2009 0.011070 0.024374 2.631360e-07 0.031257 0.151027 0.980606
In [55]:
gofs_bivariate.plot(y='R_square', figsize=(4,3))
gofs_bivariate.plot(y='K_S', figsize=(4,3))
align_figures()

univariate gof standard

In [56]:
def yearly_gof(df_all_years, start_year, end_year, density, y_ecdf, x):
    df_previous = df_all_years[str(start_year):str(end_year)]
    density_expected, _ = np.histogram(df_previous['speed'], bins=x, normed=True)
    r_square = sector_r_square(density, density_expected)
    
    y_ecdf_previous = sm.distributions.ECDF(df_previous['speed'])(x)
    k_s = max(np.abs(y_ecdf - y_ecdf_previous))
    return {'year': start_year, 'r_square': r_square, 'k_s': k_s}
In [57]:
x = arange(0, df.speed.max() + 1)
fig = plt.figure()
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)

for year_length in arange(5, 11):
    df_standard = df_all_years[str(2010):str(2014)]
    density, _ = np.histogram(df_standard['speed'], bins=x, normed=True)
    y_ecdf = sm.distributions.ECDF(df_standard.speed)(x)

    gofs = [yearly_gof(df_all_years, start_year, start_year+year_length-1, density, y_ecdf, x) 
            for start_year in arange(df_start_year, df_end_year-year_length)]

    gofs = pd.DataFrame(gofs)
    if len(gofs)>0:
        ax1.plot(gofs.year, gofs.r_square, label=year_length)
        ax2.plot(gofs.year, gofs.k_s, label=year_length)
plt.legend()
Out[57]:
<matplotlib.legend.Legend at 0x14f3b6d8>

5. GMM by Expectation-maximization

In [58]:
sample= SPEED_SET
clf = mixture.GaussianMixture(n_components=NUMBER_OF_GAUSSIAN, covariance_type='full')
clf.fit(sample)
print(clf.converged_)
True
In [59]:
gmm_em_result = read_gmm_em_result(clf)
pretty_print_gmm(gmm_em_result)
Out[59]:
weight mean_x mean_y sig_x sig_y corr
1 0.423 2.816 -2.610 1.910 2.245 0.063
2 0.298 1.634 3.809 2.326 2.337 -0.232
3 0.279 -2.409 1.632 2.166 3.032 -0.107
In [60]:
fig,ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm_em_result, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.422928679129 [[ 2.81565941 -2.61004334]] [ 1.89630976  2.25658221] 169.338468227
0.297715433637 [[ 1.63419676  3.80935132]] [ 2.04267928  2.58809133] -135.60362313
0.279355887234 [[-2.4086695   1.63208685]] [ 2.14115922  3.04997628] -171.322354158
In [61]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, lambda coords: exp(clf.score_samples(coords)))

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = exp(clf.score_samples(points))
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig_em = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z,xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar_lim=colorbar_lim)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()

Goodness-of-fit Statistics

In [62]:
points = FITTING_RANGE
gmm_pdf_result = exp(clf.score_samples(points))
gof_df(gmm_pdf_result, kde_result)
Out[62]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.952 0.022 0.030 6.478512e-07 0.049 0.237
In [63]:
gmm_em = group_gmm_param_from_gmm_param_array(gmm_em_result, sort_group = True)
mixed_model_pdf_em = generate_gmm_pdf_from_grouped_gmm_param(gmm_em)

6. GMM by Optimization

In [64]:
sample = SPEED_SET
points = FITTING_RANGE
max_speed = df.speed.max()
print(FIT_METHOD)
square_error
In [65]:
# from GMM,EM 
# GMM format: weight, meanx, meany, sigx, sigy, rho
x0 = gmm_em_result

cons = [
        # sum of every 6th element, which is the fraction of each gaussian
        {'type': 'eq', 'fun': lambda x: sum(x[::6]) - 1},
        # # limit the width/height ratio of elliplse, optional
#         {'type': 'ineq', 'fun': lambda x: width_height_ratios_set(x) - 1/3},
#         {'type': 'ineq', 'fun': lambda x: 3 - width_height_ratios_set(x)},
]

bonds = [(0., 0.99),(-fit_limit, fit_limit),(-fit_limit, fit_limit),
         (0., fit_limit),(0., fit_limit),(-0.99, 0.99)]*(len(x0)//6)

result = sp.optimize.minimize(
    lambda x0: GMM_fit_score(x0, kde_result, points, FIT_METHOD),
    x0,
    bounds = bonds,
    constraints=cons,
    tol = 0.000000000001,
    options = {"maxiter": 500})
result
Out[65]:
     fun: -15.101445121024044
     jac: array([  2.54845250e+00,  -1.90734863e-06,   5.96046448e-07,
         0.00000000e+00,   1.19209290e-07,   4.76837158e-07,
         2.54845166e+00,   0.00000000e+00,  -5.96046448e-07,
         3.57627869e-07,   5.96046448e-07,   3.57627869e-07,
         2.54845881e+00,  -1.19209290e-07,   3.57627869e-07,
        -1.19209290e-07,  -2.38418579e-07,   1.19209290e-07,
         0.00000000e+00])
 message: 'Optimization terminated successfully.'
    nfev: 982
     nit: 48
    njev: 48
  status: 0
 success: True
       x: array([ 0.34274839,  3.37305002, -1.95312609,  1.63451716,  2.82930496,
        0.24103574,  0.41645148, -0.16539615,  3.78042477,  2.87064024,
        2.04322317, -0.00899645,  0.24080013, -0.2334316 , -1.92394324,
        3.08911086,  2.01074885, -0.51950802])

6.1 GMM Result

In [66]:
gmm = group_gmm_param_from_gmm_param_array(result.x, sort_group = True)
mixed_model_pdf = generate_gmm_pdf_from_grouped_gmm_param(gmm)
gmm_pdf_result = mixed_model_pdf(points)
pretty_print_gmm(gmm)
Out[66]:
weight mean_x mean_y sig_x sig_y corr
1 0.416 -0.165 3.780 2.871 2.043 -0.009
2 0.343 3.373 -1.953 1.635 2.829 0.241
3 0.241 -0.233 -1.924 3.089 2.011 -0.520
In [67]:
fig_gmm, ax = plt.subplots(figsize=(3.5,3.5))
plot_gmm_ellipses(gmm, ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
GMM Plot Result
0.416451481358 [[-0.16539615  3.78042477]] [ 2.04305561  2.8707595 ] -90.7434354504
0.342748388928 [[ 3.37305002 -1.95312609]] [ 1.56462411  2.86854742] 168.657378216
0.240800129714 [[-0.2334316  -1.92394324]] [ 1.59790315  3.32150905] -114.782233675

6.2 Goodness-of-fit statistics

In [68]:
gof_df(gmm_pdf_result, kde_result)
Out[68]:
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.014 0.059 2.763922e-07 0.032 0.155
In [69]:
X = Y = PLOT_AXIS_RANGE
pdf_Z = generate_Z_from_X_Y(X,Y, mixed_model_pdf)# passing a function as an argument

def residule_between_kde_and_gmm(points):
    kde_vals = exp(kde.score_samples(points))
    gmm_vals = mixed_model_pdf(points)
    return kde_vals - gmm_vals 

residual_Z = generate_Z_from_X_Y(X,Y, residule_between_kde_and_gmm)

plot_3d_prob_density(X,Y,pdf_Z)
plot_3d_prob_density(X,Y,residual_Z)
align_figures()

fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,kde_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig_gmm = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,pdf_Z, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
fig = plt.figure(figsize=(3.5,2.5))
plot_2d_prob_density(X,Y,residual_Z,  xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
align_figures()
In [70]:
fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='', ylabel='', colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='', ylabel='', colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)
In [71]:
def f(V,theta):
    return (mixed_model_pdf([[V*cos(theta),V*sin(theta)]]))*V

def f_em(V,theta):
    return (mixed_model_pdf_em([[V*cos(theta),V*sin(theta)]]))*V
In [72]:
%%time
x = arange(0, max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(df.speed, x= x)
Wall time: 11.4 s
In [73]:
%%time
# Calculate Speed Distribution
# 1. GMM Model
y_ =[integrate.nquad(f, [[x_-0.01, x_+0.01],[0, 2*pi]]) for x_ in x]
y_gmm = array(list(zip(*y_))[0])/0.02

# 2. Weibull
y_weibul = sp.stats.weibull_min.pdf(x, *weibull_params)

# 3. Plot Comparison
df['speed'].hist(bins=arange(0, df.speed.max()), alpha=0.5, label='Data')
plot(x, y_gmm*len(df.speed),'-', color='black', label='GMM')
plot(x, y_weibul*len(df.speed), '--', color='black', label='Weibull') 
print('Speed Distribution Comparison')
plt_configure(xlabel='Speed'+speed_unit_text,
              ylabel='Frequency',legend=True, figsize=(4, 2))
plt.gca().set_ylim(bottom = 0)
plt.tight_layout()
plt.locator_params(axis='y', nbins=5)

# 4. R square for GMM, Weibull
print(R_square_for_speed(df['speed'], f, weibull_params, f_em))
Speed Distribution Comparison
(0.9908810749017708, 0.98095304136288908, 0.98481676199426671)
Wall time: 8.95 s
In [74]:
%%time
y_ = [integrate.nquad(f, [[0, x_val],[0, 2*pi]]) for x_val in x]
y_cdf_gmm = array(list(zip(*y_))[0])

# 5.2. CDF Comaprison
plot(x, y_ecdf,'o', alpha=0.8, label='Data')
plot(x, y_cdf_gmm,'-', color='black',label='GMM')
plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
plt_configure(xlabel = "V", ylabel='P', legend=True, figsize=(4,3))

plt.figure()
plot(log(x), log(-log(1-y_ecdf)),'o', label = 'Empirical')
plot(log(x), log(-log(1-y_cdf_weibull)),'--', label = 'Weibull')
plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label = 'GMM')
plt_configure(xlabel='ln(V)',ylabel='ln(-ln(1-P))',legend={'loc':'best'}, figsize=(4,3))
align_figures()

cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
print(cdf_diff.max(), cdf_diff_weibull.max()) 
print(x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:12: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:13: RuntimeWarning: divide by zero encountered in log
0.0176342265769 0.0747080733827
2.0 5.0
Wall time: 9.59 s
In [75]:
# Calculate Angle Distribution
x = linspace(0,2*pi, num=36+1)
y_ =[integrate.nquad(f, [[0, inf],[x_-pi/36, x_+pi/36]]) for x_ in x]
y = array(list(zip(*y_))[0])*len(df['dir']) 

df['dir'].hist(bins=DIR_BIN, alpha=0.5, label='Data')
plot(x/pi*180, y,'-', color='black', label='GMM')
plt_configure(xlabel='Direction'+dir_unit_text, ylabel='Frequency', 
              legend={'loc': 'best'} ,tight='xtight',figsize = (4,2))
plt.tight_layout()
dir_fig = plt.gcf()
print('Direction Distribution Comparison')
Direction Distribution Comparison
In [76]:
%%time
incre = max(SECTOR_LENGTH, 10)
density_collection=Parallel(n_jobs=-1)(delayed(direction_compare)(gmm, df, angle, incre) 
                                        for angle in arange(0, 360, incre))  
# This R square is computed as in paper 
# Comparison of bivariate distribution constructionapproaches for analysing wind speed anddirection data
# http://onlinelibrary.wiley.com/doi/10.1002/we.400/full
print(true_R_square(density_collection))
0.913884447954
Wall time: 8.34 s

6.3 Sectoral Comaprison

In [77]:
# %%time
# curve_collection=Parallel(n_jobs=-1)(delayed(direction_compare2)
#                                      (gmm, df, angle, incre, complex=True) for angle in arange(start, end, incre))  
In [78]:
# Calculate Speed Distribution
def model_data_comparison(df, original_incre = 10, incre = 10):
    start, end = -original_incre/2 + incre/2, 360
    curve_collection = []
    max_speed = df.speed.max()
    
    # Find a max count for plotting histogram
    max_count = max_count_for_angles(df, start, end, incre)
    plot_range = [0, max_speed, 0, max_count*1.05]
    
    for angle in arange(start, end, incre):
        angle_radian, incre_radian = np.radians([angle, incre])  
        start_angle, end_angle = angle-incre/2, angle+incre/2
        
        # 0. Select data from observation
        sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
        data_size = len(sub_df.speed)
        # 1. Get Weibull and ECDF
        x, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed)
        # 2. Get GMM PDF, CDF
        _, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)
        
        # 3. R square for GMM, Weibull
        bins = arange(0, sub_df.speed.max()+1)
        density, _ = np.histogram(sub_df['speed'], bins=bins, normed=True)
        density_expected_gmm_ =[integrate.nquad(f, [[x_, x_+1],[angle_radian-incre_radian/2, angle_radian+incre_radian/2]]) 
                            for x_ in bins[:-1]]
        density_expected_gmm = array(list(zip(*density_expected_gmm_ ))[0])/direction_prob
        R_square_gmm = sector_r_square(density, density_expected_gmm)
        
        density_expected_weibull = sp.stats.weibull_min.cdf(bins[1:], *weibull_params) - sp.stats.weibull_min.cdf(bins[:-1], *weibull_params) 
        R_square_weibull = sector_r_square(density, density_expected_weibull)

        # 4. K-S for GMM, Weibull
        cdf_diff, cdf_diff_weibull= np.abs(y_ecdf - y_cdf_gmm), np.abs(y_ecdf - y_cdf_weibull)
                
        # 5. Make Plots
        fig = plt.figure(figsize=(10,1.9))
        # 5.1. Frequency Comparison
        ax1 = fig.add_subplot(1,3,1)        
        sub_df['speed'].hist(bins=arange(0, sub_max_speed), alpha=0.5, label='Data')                  
        plot(x, y_gmm*data_size,'-', color='black', label='GMM')
        plot(x, y_weibull*data_size, '--', color='black',label='Weibull')   
        plt_configure(xlabel = "V", ylabel='Frequency', legend=True)
        plt.axis(plot_range)
        
        # 5.2. CDF Comaprison
        ax2 = fig.add_subplot(1,3,2)
        plot(x, y_ecdf,'o', alpha=0.8, label='Data')
        plot(x, y_cdf_gmm,'-', color='black',label='GMM')
        plot(x, y_cdf_weibull,'--', color='black',label='Weibull')
        plt.gca().set_xlim(right = max_speed)
        plt_configure(xlabel = "V", ylabel='P', legend=True)
        
        # 5.3. Weibull Comparison
#         ax3 = fig.add_subplot(1,3,3)
#         plot(log(x), log(-log(1-y_ecdf)),'o', alpha=0.8, label='Data')
#         plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black', label='GMM')
#         plot(log(x), log(-log(1-y_cdf_weibull)),'--',color='black',label='Weibull')
#         plt.gca().set_xlim(right = log(max_speed+1))
#         plt_configure(xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})
        
        curves = {'direction': angle, 'datasize': data_size, 'weight': direction_prob, 'x': x, 
                  'gmm_pdf': y_gmm, 'gmm_cdf': y_cdf_gmm,
                  'weibull_pdf': y_weibull, 'weibull_cdf': y_cdf_weibull, 'ecdf': y_ecdf,
                  'max_cdf_diff_gmm': cdf_diff.max(), 'max_cdf_diff_weibull': cdf_diff_weibull.max(), 
                  'r_square_gmm': R_square_gmm, 'r_square_weibull': R_square_weibull}
        curve_collection.append(curves)
        
        plt.tight_layout()
        plt.show()
        print('%s (%s - %s) degree' % (angle, start_angle, end_angle))
        print('data size:', len(sub_df), 'weight', len(sub_df)/len(df))
        print('GMM', 'Weibull')
        print('R square', R_square_gmm,  R_square_weibull)
        print('max diff:', cdf_diff.max(), cdf_diff_weibull.max(), 
              'speed value:', x[cdf_diff.argmax()], x[cdf_diff_weibull.argmax()], 'y gmm', y_cdf_gmm[cdf_diff.argmax()])
        print(' ')
    return curve_collection
In [79]:
%%time
if len(effective_column) == 16:
    rebinned_angle = 22.5
else: 
    rebinned_angle = 20
    
curve_collection = model_data_comparison(df, SECTOR_LENGTH, rebinned_angle)
5.0 (-5.0 - 15.0) degree
data size: 1208 weight 0.0332306338028169
GMM Weibull
R square 0.956723843044 0.935972361096
max diff: 0.0425582979042 0.0463744773093 speed value: 2.42093519951 2.42093519951 y gmm 0.142723860818
 
25.0 (15.0 - 35.0) degree
data size: 1869 weight 0.051413952464788734
GMM Weibull
R square 0.880500096161 0.936612159001
max diff: 0.094037887573 0.0650478976747 speed value: 4.59697390934 4.59697390934 y gmm 0.538920913925
 
45.0 (35.0 - 55.0) degree
data size: 2275 weight 0.0625825264084507
GMM Weibull
R square 0.952533606799 0.97565363768
max diff: 0.0485438303857 0.0802976294208 speed value: 6.013932862 6.013932862 y gmm 0.738708916867
 
65.0 (55.0 - 75.0) degree
data size: 2182 weight 0.060024207746478875
GMM Weibull
R square 0.982014283447 0.978012876598
max diff: 0.0483750436379 0.0506473371303 speed value: 4.77688074446 5.45929227938 y gmm 0.509878251704
 
85.0 (75.0 - 95.0) degree
data size: 3411 weight 0.0938325264084507
GMM Weibull
R square 0.961442770434 0.961985327864
max diff: 0.0578585831436 0.0641031492664 speed value: 6.679242051 5.19496603966 y gmm 0.847363127266
 
105.0 (95.0 - 115.0) degree
data size: 2718 weight 0.07476892605633803
GMM Weibull
R square 0.901558690596 0.958401367174
max diff: 0.0926965916773 0.0892725600103 speed value: 4.21132941262 4.21132941262 y gmm 0.38154917727
 
125.0 (115.0 - 135.0) degree
data size: 2549 weight 0.07011993838028169
GMM Weibull
R square 0.953635430845 0.944636129389
max diff: 0.0642413661877 0.111416088785 speed value: 3.144664528 6.28932905599 y gmm 0.199196250456
 
145.0 (135.0 - 155.0) degree
data size: 1271 weight 0.03496368838028169
GMM Weibull
R square 0.946320956594 0.952519919787
max diff: 0.0364597818215 0.130964029404 speed value: 6.46073898181 5.02501920807 y gmm 0.790196996613
 
165.0 (155.0 - 175.0) degree
data size: 1248 weight 0.03433098591549296
GMM Weibull
R square 0.942572250814 0.948631985422
max diff: 0.0443507029358 0.0858274976087 speed value: 4.22950913636 4.22950913636 y gmm 0.510937758603
 
185.0 (175.0 - 195.0) degree
data size: 929 weight 0.02555567781690141
GMM Weibull
R square 0.905300842902 0.942883789666
max diff: 0.0935721096031 0.0698079107952 speed value: 4.14382860276 4.14382860276 y gmm 0.578117879633
 
205.0 (195.0 - 215.0) degree
data size: 889 weight 0.02445532570422535
GMM Weibull
R square 0.956076185606 0.946763097069
max diff: 0.0307216185469 0.097053547368 speed value: 3.28951688177 3.75944786488 y gmm 0.506961171104
 
225.0 (215.0 - 235.0) degree
data size: 882 weight 0.024262764084507043
GMM Weibull
R square 0.914661104875 0.966058286128
max diff: 0.0767804892861 0.0427839674295 speed value: 4.70042074349 3.13361382899 y gmm 0.88063536457
 
245.0 (235.0 - 255.0) degree
data size: 737 weight 0.02027398767605634
GMM Weibull
R square 0.857163323282 0.989912521509
max diff: 0.0857367773879 0.0178756702104 speed value: 1.81283638278 4.22995155983 y gmm 0.205986424783
 
265.0 (255.0 - 275.0) degree
data size: 1228 weight 0.03378080985915493
GMM Weibull
R square 0.919155187369 0.985007005276
max diff: 0.0694441949014 0.0648220523226 speed value: 4.49430885217 3.93252024565 y gmm 0.722538657442
 
285.0 (275.0 - 295.0) degree
data size: 2167 weight 0.05961157570422535
GMM Weibull
R square 0.942446589971 0.958439833396
max diff: 0.0563496472957 0.0830815157576 speed value: 4.37180077706 4.37180077706 y gmm 0.451726033369
 
305.0 (295.0 - 315.0) degree
data size: 4149 weight 0.11413402288732394
GMM Weibull
R square 0.991953107313 0.990236288502
max diff: 0.0129132740474 0.0747770698904 speed value: 7.45498720926 4.74408276953 y gmm 0.907827711261
 
325.0 (315.0 - 335.0) degree
data size: 3596 weight 0.09892165492957747
GMM Weibull
R square 0.959484633792 0.967400480372
max diff: 0.0503408126658 0.0434886085656 speed value: 4.99431601846 4.99431601846 y gmm 0.587590221817
 
345.0 (335.0 - 355.0) degree
data size: 2586 weight 0.07113776408450705
GMM Weibull
R square 0.961989994681 0.95895874508
max diff: 0.0602748692268 0.0942410111824 speed value: 2.92109614974 3.65137018717 y gmm 0.204899772552
 
Wall time: 45 s
In [80]:
diff_df = pd.DataFrame(curve_collection) 

gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.r_square_gmm, diff_df.r_square_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="$\ R^2$", xlabel='Direction'+dir_unit_text)
ylim = min(plt.gca().get_ylim()[0],0.75)
plt.gca().set_ylim(top=1, bottom=ylim)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.9473742728899047 0.9632921004644883
In [81]:
gmm_mean, weibull_mean = plot_sectoral_comparison(diff_df.max_cdf_diff_gmm, diff_df.max_cdf_diff_weibull, 
                                                  diff_df.direction, diff_df.datasize)
plt_configure(ylabel="K-S", xlabel='Direction'+dir_unit_text)
ylim = max(plt.gca().get_ylim()[1],0.25)
plt.gca().set_ylim(top=ylim, bottom=0)
plt.tight_layout()
print(gmm_mean, weibull_mean)
0.05574508751195179 0.07415373641838202
In [82]:
# Compare direction weight with previous figure
display(dir_fig)

6.4 Insufficient-fit Sector Investigation

6.4.1 Data Variability, by Bootstrap (Resampling)

In [83]:
angle =  max_diff_angle = diff_df.ix[diff_df['max_cdf_diff_gmm'].idxmax()]['direction']
incre = rebinned_angle
In [84]:
FRACTION = 1

# Select data from observation
start_angle, end_angle = angle-incre/2, angle+incre/2
angle_radian, incre_radian = radians(angle), radians(incre)  
sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)
In [85]:
x = arange(0, sub_max_speed, 0.5)
_, y_weibull, y_cdf_weibull, weibull_params, y_ecdf = fit_weibull_and_ecdf(sub_df.speed, x)
_, y_gmm, y_cdf_gmm, direction_prob = gmm_integration_in_direction(f, angle_radian-incre_radian/2, angle_radian+incre_radian/2, x)

fig = plt.figure(figsize=(10,1.9))
ax1 = fig.add_subplot(1,3,1)   
ax2 = fig.add_subplot(1,3,2)   
ax3 = fig.add_subplot(1,3,3)   

# 1. Data
bins=arange(0, sub_max_speed)
sub_df['speed'].hist(ax=ax1, bins=bins, alpha=0.5, label='Data', normed=True)  

# 2. GMM
ax1.plot(x, y_gmm,'-', color='black', label='GMM')
ax2.plot(x, y_cdf_gmm,'-', color = 'black', label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color = 'black',label='GMM')

# 3. Weilbull 
ax1.plot(x, y_weibull,'--',color='black',label='Weibull')
ax2.plot(x, y_cdf_weibull,'--',label='Weibull')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)),'--',label='Weibull')

# 4. Data Resampled
count_collection = []
for i in range(1,100):
    sub_df_resampled = sub_df.sample(frac=FRACTION, replace=True)    
    resampled_count, _ = np.histogram(sub_df_resampled['speed'], bins=bins, normed=True) 
    count_collection.append(resampled_count)
    
    ecdf = sm.distributions.ECDF(sub_df_resampled.speed)
    y_ecdf = ecdf(x) 
    ax2.plot(x, y_ecdf,':', label='Data Resampled')
    ax3.plot(log(x), log(-log(1-y_ecdf)),':', label='Data Resampled')
    if i == 1: 
#         plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
#         plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)",legend={'loc':'best'})
        plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
        plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)",legend={'loc':'best'})

print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
count_collection = np.array(count_collection)
mx, mn = np.max(count_collection,0), np.min(count_collection,0)
ax1.plot(bins[1:]-0.5, mx, ':', color='blue')
ax1.plot(bins[1:]-0.5, mn, ':', color='blue', label='Resample limit')
ax1.set_ylim(bottom = 0)
# plt_configure(ax=ax1, xlabel='$V$',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax1, xlabel='V', ylabel='Frequency',legend={'loc':'best'})
ax1.locator_params(axis='y', nbins=5)
ax2.locator_params(axis='y', nbins=5)
ax3.locator_params(axis='y', nbins=5)
plt.tight_layout()
diff = abs(y_ecdf - y_cdf_gmm)
print(diff.max(), x[diff.argmax()], y_cdf_gmm[diff.argmax()])
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:17: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:22: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:34: RuntimeWarning: divide by zero encountered in log
25.0 (15.0 - 35.0) Degree Speed Distribution
0.0884206951512 5.0 0.622654746261

6.4.2 Time Variability

In [86]:
fig_time_variability_3d = plt.figure()
ax1 = fig_time_variability_3d.gca(projection='3d')

fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# 3. Data
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
for start_time in range(2001, 2015, 5):
    end_time = start_time + 4 
    df_other_years = df_all_years[str(start_time):str(end_time)]
    df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
    if len(df_other_years_at_angle) > 0 :
        
        ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
        y_ecdf = ecdf(x)
        ax2.plot(x, y_ecdf,':', label = start_time)
        ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = start_time)
        
        count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
                                       bins=arange(0, sub_max_speed_other_year))
        ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
                color=next(prop_cycle), alpha=0.8)
        x_3d = start_time*np.ones_like(x)
        ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
        ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
ax1.set_ylim(bottom = 0)
ax1.set_zlabel('Frequency')
plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

ax1.set_zlim(bottom = 0)
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:10: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:11: RuntimeWarning: divide by zero encountered in log
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:24: RuntimeWarning: divide by zero encountered in log
25.0 (15.0 - 35.0) Degree Speed Distribution
In [87]:
# fig_time_variability_3d = plt.figure()
# ax1 = fig_time_variability_3d.gca(projection='3d')

# fig_time_variability_cdf,ax2 = plt.subplots(figsize=(3,1.8))
# fig_time_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

# ax2.plot(x, y_cdf_gmm,'-', color='black', label = 'GMM')
# ax2.plot(x, y_cdf_weibull,'--', label='Weibull')

# ax3.plot(log(x), log(-log(1-y_cdf_gmm)),'-', color='black',label='GMM')
# ax3.plot(log(x), log(-log(1-y_cdf_weibull)), '--', label='Weibull')

# # 3. Data
# prop_cycle=iter(mpl.rcParams['axes.color_cycle'])
# for start_time in range(2001, 2015, 5):
#     end_time = start_time + 4 
# #     df_other_years = df_all_years.query('(date >= @start_time) & (date < @end_time)')
#     df_other_years = df_all_years[str(start_time):str(end_time)]
#     df_other_years_at_angle, sub_max_speed_other_year = select_df_by_angle(df_other_years, start_angle, end_angle)
#     if len(df_other_years_at_angle) > 0 :
        
#         ecdf = sm.distributions.ECDF(df_other_years_at_angle.speed)
#         y_ecdf = ecdf(x)
#         ax2.plot(x, y_ecdf,':', label = time_label)
#         ax3.plot(log(x), log(-log(1-y_ecdf)),':', label = time_label)
        
#         title = '%s - %s' %(start_time, time_label+4)
#         count, division = np.histogram(df_other_years_at_angle['speed'], normed=True,
#                                        bins=arange(0, sub_max_speed_other_year))
#         ax1.bar(left=division[:-1], height=count, zs=start_time, zdir='x', 
#                 color=next(prop_cycle), alpha=0.8)
#         x_3d = time_label*np.ones_like(x)
#         ax1.plot(x_3d, x, y_gmm, '-', color='black', label='GMM'  if start_time == 2011 else '')
#         ax1.plot(x_3d, x, y_weibull, '--', color='blue', label='Weibull' if start_time == 2011 else '')
        
# print('%s (%s - %s) Degree Speed Distribution' % (angle, start_angle, end_angle))
# ax1.set_ylim(bottom = 0)
# ax1.set_zlabel('Frequency')
# plt_configure(ax=ax1, xlabel='Time',ylabel='V', legend=True)
# # plt_configure(ax=ax2, xlabel = "$V$", ylabel='$P$', legend={'loc':'best'})
# # plt_configure(ax=ax3, xlabel="ln($V$)", ylabel="ln(-ln(1-$P$)", legend={'loc':'best'})
# plt_configure(ax=ax2, xlabel = "V", ylabel='P', legend={'loc':'best'})
# plt_configure(ax=ax3, xlabel="ln(V)", ylabel="ln(-ln(1-P)", legend={'loc':'best'})

# ax1.set_zlim(bottom = 0)
# align_figures()

6.4.3 Adjacent Sector Variability

In [88]:
incre = rebinned_angle
angle_group = [max_diff_angle-incre, max_diff_angle, max_diff_angle+incre]
In [89]:
fig_adjecent_variability_3d = plt.figure()
ax1 = fig_adjecent_variability_3d.gca(projection='3d')
fig_adjecent_variability_cdf, ax2 = plt.subplots(figsize=(3,1.8))
fig_adjecent_variability_weibull, ax3 = plt.subplots(figsize=(3,1.8))

legend_3d = False
prop_cycle=iter(mpl.rcParams['axes.color_cycle'])

curve_df = pd.DataFrame(curve_collection)

for angle in angle_group:
    curves = curve_df.query('direction == @angle%360').T.to_dict()
    curves = curves[list(curves)[0]]
    data_size, x =  curves['datasize'], curves['x']
    y_gmm, y_cdf_gmm =  curves['gmm_pdf'], curves['gmm_cdf'] 
    y_weibull, y_cdf_weibull, y_cdf = curves['weibull_pdf'],  curves['weibull_cdf'], curves['ecdf']

    linestyle = '-' if angle == max_diff_angle else ':'
    alpha = 0.7 if angle == max_diff_angle else 0.3

    ax2.plot(x, y_gmm*data_size, linestyle, label=angle)        
    ax3.plot(x, y_weibull*data_size, linestyle, label=angle)

    start_angle, end_angle = angle-incre/2, angle+incre/2
    sub_df, sub_max_speed = select_df_by_angle(df, start_angle, end_angle)

    x_3d = angle*np.ones_like(x)
    ax1.plot(x_3d, x, y_gmm*data_size, color='black', label='GMM')
    ax1.plot(x_3d, x, y_weibull*data_size, color='blue', linestyle='--',label='Weibull')

    count, division = np.histogram(sub_df['speed'], bins=arange(0, sub_max_speed))
    ax1.bar(left=division[:-1], height=count, zs=angle, zdir='x', color=next(prop_cycle), alpha=0.8)

    if legend_3d == False:
        ax1.legend()
        legend_3d = True
        
plt_configure(ax=ax1, xlabel='Direction', ylabel='Speed')   
plt_configure(ax=ax2, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
plt_configure(ax=ax3, xlabel='V',ylabel='Frequency',legend={'loc':'best'})
ax1.set_zlabel('Frequency')
ax1.set_zlim(bottom = 0)
ylim = max(ax1.get_ylim()[1],ax3.get_ylim()[1])
ax2.set_ylim(bottom = 0, top=ylim)
ax3.set_ylim(bottom = 0, top=ylim)

print(max_diff_angle) 
print('GMM, Weibull, Histogram')
align_figures()
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
25.0
GMM, Weibull, Histogram

7. Result Variability & Cross-Validation

In [90]:
if 'bandwidth' not in globals():
    bandwidth = DEFAULT_BANDWDITH    
if 'FIT_METHOD' not in globals():
    FIT_METHOD = 'square_error'       
if 'KDE_KERNEL' not in globals():
    KDE_KERNEL = 'gaussian'
    
config = {'bandwidth': bandwidth, 
          'fitting_range': FITTING_RANGE,
          'fit_limit': fit_limit,
          'kde_kernel': KDE_KERNEL}

print(bandwidth, FIT_METHOD)
0.6 square_error

7.1 Variability of the Result

In [91]:
%%time
results = Parallel(n_jobs=-1)(delayed(resampled_fitting)(df, FIT_METHOD, NUMBER_OF_GAUSSIAN, config) for i in range(10))                        
for result in results:
    display(pretty_print_gmm(result['gmm']))
    fig,ax = plt.subplots(figsize=(3.5,3.5))
    plot_gmm_ellipses(result['gmm'],ax=ax, xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text)
    plt.show()
    
    display(gof_df(result['gmm_pdf_result'], result['kde_result']))
    display(gof_df(result['gmm_pdf_result'], kde_result))
    print('')
weight mean_x mean_y sig_x sig_y corr
1 0.414 -0.278 3.765 2.834 2.065 -0.006
2 0.343 3.372 -1.963 1.609 2.923 0.272
3 0.242 -0.108 -1.948 3.180 1.987 -0.512
GMM Plot Result
0.414236967922 [[-0.2781772   3.76540801]] [ 2.06538129  2.83358869] -90.523501008
0.343499159897 [[ 3.37199547 -1.96287064]] [ 1.5255047   2.96731302] 168.374934371
0.242263872181 [[-0.10783608 -1.94833513]] [ 1.60006909  3.3913694 ] -113.210067697
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.012 0.078 2.709747e-07 0.031 0.153
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.012 0.058 2.848840e-07 0.033 0.157

weight mean_x mean_y sig_x sig_y corr
1 0.403 -0.289 3.821 2.838 1.979 0.013
2 0.351 3.384 -1.766 1.626 2.951 0.249
3 0.246 -0.142 -1.939 3.136 2.017 -0.521
GMM Plot Result
0.403494671051 [[-0.28923898  3.821276  ]] [ 1.97893618  2.83835073] -89.0163125006
0.3507600323 [[ 3.38380721 -1.76577121]] [ 1.55415515  2.98947863] 169.244150337
0.24574529665 [[-0.14155813 -1.93857203]] [ 1.60325466  3.36635092] -114.423043303
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.014 0.135 2.878189e-07 0.033 0.158
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.015 0.061 2.860810e-07 0.033 0.157

weight mean_x mean_y sig_x sig_y corr
1 0.427 -0.099 3.778 2.892 2.082 -0.043
2 0.344 3.322 -2.158 1.676 2.767 0.230
3 0.230 -0.513 -1.826 2.946 2.016 -0.531
GMM Plot Result
0.426563403272 [[-0.09907271  3.77793705]] [ 2.07760372  2.89505466] -93.6472132122
0.343798469248 [[ 3.32247804 -2.15808918]] [ 1.60764829  2.80758447] 168.10373671
0.22963812748 [[-0.51264689 -1.82559389]] [ 1.56989801  3.20574595] -116.901551349
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.012 0.062 2.855565e-07 0.033 0.157
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.017 0.075 2.962927e-07 0.033 0.160

weight mean_x mean_y sig_x sig_y corr
1 0.428 -0.039 3.761 2.902 2.073 -0.031
2 0.334 3.307 -2.075 1.650 2.746 0.241
3 0.238 -0.322 -1.915 3.172 2.045 -0.553
GMM Plot Result
0.428452165074 [[-0.0394816   3.76052852]] [ 2.07150507  2.90300578] -92.5500373133
0.333708537776 [[ 3.30738957 -2.07471955]] [ 1.57710546  2.7891833 ] 167.786895242
0.23783929715 [[-0.32167122 -1.91512216]] [ 1.57547246  3.42928709] -115.340609454
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.012 0.109 2.933881e-07 0.034 0.160
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.013 0.068 2.862953e-07 0.033 0.158

weight mean_x mean_y sig_x sig_y corr
1 0.406 -0.238 3.823 2.809 2.037 0.018
2 0.331 3.379 -1.824 1.603 2.936 0.246
3 0.263 0.021 -1.944 3.245 2.021 -0.507
GMM Plot Result
0.406420193971 [[-0.23750922  3.82275599]] [ 2.0360284   2.80964367] -88.4456024214
0.330793254529 [[ 3.37946515 -1.82449633]] [ 1.53476098  2.97262778] 169.539556066
0.2627865515 [[ 0.02133319 -1.94380201]] [ 1.63633819  3.45479235] -112.950859218
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.013 0.044 2.939800e-07 0.032 0.160
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.012 0.049 2.823632e-07 0.032 0.156

weight mean_x mean_y sig_x sig_y corr
1 0.416 -0.245 3.757 2.835 2.026 0.005
2 0.357 3.370 -2.002 1.651 2.926 0.239
3 0.227 -0.390 -1.813 2.956 1.917 -0.488
GMM Plot Result
0.415704427384 [[-0.24478405  3.75698442]] [ 2.02564485  2.83466673] -89.5789198982
0.357188610341 [[ 3.37007545 -2.00227637]] [ 1.58282947  2.96305392] 169.226563485
0.227106962275 [[-0.39048869 -1.81346927]] [ 1.56775636  3.1553892 ] -113.7646466
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.016 0.079 2.996040e-07 0.034 0.161
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.015 0.076 2.844983e-07 0.033 0.157

weight mean_x mean_y sig_x sig_y corr
1 0.423 -0.173 3.755 2.916 2.051 0.002
2 0.366 3.335 -2.145 1.686 2.807 0.270
3 0.211 -0.440 -1.780 2.961 1.807 -0.502
GMM Plot Result
0.423119706391 [[-0.17302571  3.754988  ]] [ 2.05076604  2.91582455] -89.8767176332
0.36567445395 [[ 3.33482166 -2.14517238]] [ 1.59312497  2.8611953 ] 166.545268812
0.211205839659 [[-0.43994244 -1.78027419]] [ 1.47332259  3.14020281] -112.150595278
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.978 0.014 0.155 3.053331e-07 0.033 0.163
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.013 0.164 2.871349e-07 0.033 0.158

weight mean_x mean_y sig_x sig_y corr
1 0.408 -0.199 3.827 2.845 2.011 -0.001
2 0.350 3.375 -1.827 1.639 2.837 0.238
3 0.242 -0.214 -2.033 3.067 2.079 -0.522
GMM Plot Result
0.408070357667 [[-0.19940972  3.82698245]] [ 2.01052045  2.84494373] -90.0742685891
0.349607476685 [[ 3.37536681 -1.82691313]] [ 1.57117094  2.87556245] 168.802859448
0.242322165648 [[-0.21404736 -2.03262134]] [ 1.63526393  3.32435015] -116.321637899
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.014 0.184 2.723922e-07 0.032 0.154
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.015 0.043 2.851677e-07 0.033 0.157

weight mean_x mean_y sig_x sig_y corr
1 0.419 -0.198 3.765 2.858 2.087 0.030
2 0.347 3.367 -1.919 1.620 2.931 0.251
3 0.234 -0.086 -1.977 3.091 1.919 -0.519
GMM Plot Result
0.418753784482 [[-0.19842539  3.76462716]] [ 2.08489161  2.85936744] -87.3088608194
0.346955975112 [[ 3.36661392 -1.91861447]] [ 1.54800161  2.97018946] 169.11547608
0.234290240407 [[-0.08584994 -1.97728133]] [ 1.53726533  3.29688824] -113.175738321
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.012 0.073 2.818100e-07 0.032 0.156
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.012 0.095 2.806936e-07 0.032 0.156

weight mean_x mean_y sig_x sig_y corr
1 0.424 -0.162 3.743 2.898 2.039 -0.017
2 0.349 3.362 -2.077 1.652 2.861 0.265
3 0.227 -0.272 -1.873 3.015 1.875 -0.500
GMM Plot Result
0.424084890149 [[-0.16157632  3.74254442]] [ 2.03816425  2.89850764] -91.3548384153
0.348792872894 [[ 3.36173994 -2.07694837]] [ 1.56740763  2.90831895] 167.685092144
0.227122236956 [[-0.2719564  -1.87347942]] [ 1.52744214  3.20568806] -112.714816391
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.980 0.014 0.097 2.789442e-07 0.032 0.156
R_square K_S Chi_square MSE RMSE / Max RMSE / Mean
0 0.979 0.014 0.101 2.802560e-07 0.032 0.156
Wall time: 12 s

7.2 Cross-validation, to select the number of Gaussian

In [92]:
%%time
from sklearn.cross_validation import train_test_split, KFold

## 5-fold cross validation
gaussian_number_range = arange(1,6)
CV_result_train_all,CV_result_test_all =[],[]
number_of_fold = 4
print('Number of train/test dataset', len(df)*(number_of_fold-1)/number_of_fold, len(df)/number_of_fold) 

for number_of_gaussian in gaussian_number_range:
    print( '  ')
    print('Number of gaussian', number_of_gaussian)
    
    kf = KFold(len(df), n_folds=number_of_fold, shuffle=True) 

    CV_result = Parallel(n_jobs=-1)(delayed(fit_per_fold)(df, train_index, test_index, FIT_METHOD, number_of_gaussian, config) for train_index, test_index in kf)                        

    CV_result_train, CV_result_test = list(zip(*CV_result))
    CV_result_train, CV_result_test = list(CV_result_train), list(CV_result_test)
        
    CV_result_train_all.append(CV_result_train)
    CV_result_test_all.append(CV_result_test)
    
    print('Train')
    pretty_pd_display(CV_result_train)
    print('Test')
    pretty_pd_display(CV_result_test)
D:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Number of train/test dataset 27264.0 9088.0
  
Number of gaussian 1
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.122398 0.074840 0.000003 0.114793 0.539453 0.752251
1 0.122183 0.075203 0.000003 0.109292 0.543974 0.749388
2 0.125692 0.074884 0.000003 0.112027 0.546035 0.746492
3 0.125651 0.075523 0.000003 0.115031 0.549437 0.743158
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.137077 0.081271 0.000004 0.110832 0.579663 0.717183
1 0.134351 0.074527 0.000003 0.124924 0.549177 0.742431
2 0.122802 0.075113 0.000003 0.114713 0.539682 0.753359
3 0.124355 0.070300 0.000003 0.105688 0.528192 0.764769
  
Number of gaussian 2
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.055941 0.033139 8.261400e-07 0.054195 0.267619 0.939675
1 0.054116 0.033517 8.277967e-07 0.055263 0.267966 0.938964
2 0.052578 0.032610 8.267774e-07 0.056517 0.267606 0.938393
3 0.054438 0.032826 8.193619e-07 0.055461 0.266495 0.939859
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.062845 0.029542 9.053416e-07 0.062065 0.280089 0.931818
1 0.063512 0.037842 8.769554e-07 0.057618 0.275420 0.935787
2 0.055252 0.036756 9.291716e-07 0.055464 0.284123 0.934210
3 0.059163 0.030862 9.024610e-07 0.056948 0.279718 0.933037
  
Number of gaussian 3
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.063280 0.013722 2.675181e-07 0.031905 0.152242 0.980203
1 0.054828 0.014094 2.800179e-07 0.031722 0.155871 0.979406
2 0.051197 0.013095 2.873651e-07 0.032552 0.157736 0.978968
3 0.097711 0.013331 2.858822e-07 0.032834 0.157474 0.978863
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.067151 0.013959 3.938516e-07 0.036892 0.184906 0.971489
1 0.407048 0.015689 3.626726e-07 0.038605 0.177054 0.973266
2 0.062568 0.016686 3.264574e-07 0.035180 0.168513 0.975557
3 0.054013 0.013793 3.291250e-07 0.034164 0.168730 0.976095
  
Number of gaussian 4
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.031003 0.007444 2.123866e-07 0.028224 0.135657 0.984259
1 0.037518 0.008599 2.355452e-07 0.029705 0.142811 0.982827
2 0.035393 0.010149 2.241996e-07 0.028802 0.139557 0.983351
3 0.083414 0.011669 2.269595e-07 0.028807 0.140216 0.983337
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.042262 0.006908 2.953966e-07 0.032625 0.160115 0.978670
1 0.039695 0.013038 2.700677e-07 0.031251 0.153257 0.979566
2 0.048576 0.009245 2.837589e-07 0.032630 0.156327 0.979657
3 0.071337 0.016529 3.308829e-07 0.035882 0.169523 0.975442
  
Number of gaussian 5
Train
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.061833 0.008580 1.568392e-07 0.024014 0.116556 0.988437
1 0.028166 0.008896 1.617307e-07 0.024122 0.118398 0.988087
2 0.048148 0.008013 1.494477e-07 0.023799 0.113807 0.989013
3 0.042980 0.007998 1.379067e-07 0.022870 0.109389 0.989831
Test
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
0 0.052235 0.013631 2.143065e-07 0.028629 0.136446 0.984319
1 0.033117 0.012692 2.887258e-07 0.034381 0.158218 0.978835
2 0.070651 0.013977 2.539747e-07 0.029796 0.148415 0.981277
3 0.065885 0.019409 2.750438e-07 0.030974 0.154178 0.979892
Wall time: 41.1 s
In [93]:
train_scores_mean, train_scores_std = generate_mean_std_gof(CV_result_train_all)
print('Train gof mean, std')
display(train_scores_mean)

test_scores_mean, test_scores_std = generate_mean_std_gof(CV_result_test_all)
print('Test gof mean, std')
display(test_scores_mean)
Train gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.123981 0.075112 3.423259e-06 0.112786 0.544725 0.747822
2 0.054268 0.033023 8.250190e-07 0.055359 0.267422 0.939223
3 0.066754 0.013561 2.801958e-07 0.032253 0.155831 0.979360
4 0.046832 0.009465 2.247727e-07 0.028884 0.139560 0.983444
5 0.045282 0.008372 1.514811e-07 0.023701 0.114538 0.988842
Test gof mean, std
Chi_square K_S MSE RMSE / Max RMSE / Mean R_square
1 0.129646 0.075303 3.483026e-06 0.114039 0.549179 0.744435
2 0.060193 0.033750 9.034824e-07 0.058024 0.279838 0.933713
3 0.147695 0.015032 3.530267e-07 0.036210 0.174800 0.974102
4 0.050468 0.011430 2.950265e-07 0.033097 0.159806 0.978334
5 0.055472 0.014927 2.580127e-07 0.030945 0.149314 0.981081
In [94]:
prop_cycle=mpl.rcParams['axes.color_cycle']
gaussian_number_range = train_scores_mean.index
for column, column_name in zip(['R_square','K_S','Chi_square'],["$\ R^2$", "K-S", "$\widetilde{\chi^2} $"]):
    plot(gaussian_number_range, train_scores_mean[column],
             '--', label = 'training', color=prop_cycle[0])
    plt.fill_between(gaussian_number_range, 
                     train_scores_mean[column] - train_scores_std[column],
                     train_scores_mean[column] + train_scores_std[column], 
                     alpha=0.2, color=prop_cycle[0])
    
    plot(gaussian_number_range, test_scores_mean[column],
             '-', label = 'test',color=prop_cycle[1])
    plt.fill_between(gaussian_number_range, 
                 test_scores_mean[column] - test_scores_std[column],
                 test_scores_mean[column] + test_scores_std[column], 
                 alpha=0.2,color=prop_cycle[1])
    plt.xticks(gaussian_number_range)
    print(column)
    plt.locator_params(axis='y', nbins=5)
    plt_configure(xlabel='Number of Gaussian Distributions', ylabel=column_name, 
                  figsize=(3,2), legend={'loc':'best'})
    if column == 'R_square':
        plt.gca().set_ylim(top=1)
    if column == 'K_S' or column == 'Chi_square':
        plt.gca().set_ylim(bottom=0)
    plt.show()
R_square
D:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py:938: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
K_S
Chi_square
In [95]:
fig = plt.figure(figsize=(4.2,2.4))
ax1 = fig.add_subplot(1,2,1) 
plot_2d_prob_density(X, Y, kde_Z, ax=ax1,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax1.grid(False)
ax2 = fig.add_subplot(1,2,2) 
plot_2d_prob_density(X, Y, pdf_Z, ax=ax2,
                     xlabel='x'+speed_unit_text, ylabel='y'+speed_unit_text, colorbar=False)
ax2.grid(False)
ax2.get_yaxis().set_visible(False)
In [ ]:
for fig in [fig_hist, fig_kde, fig_em, fig_gmm]:
    display(fig)
for fig in [fig_time_variability_3d, fig_time_variability_cdf, fig_time_variability_weibull, 
            fig_adjecent_variability_3d, fig_adjecent_variability_cdf, fig_adjecent_variability_weibull,]:
    display(fig)
In [ ]:
import time
save_notebook()
time.sleep(3)
location_name = get_location_name(file_path)
print(location_name)
current_file = 'GMM.ipynb'
output_file = './output_HTML/'+location_name+'.html' 

output_HTML(current_file, output_file)